{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# File Cache (AOP based on decorators)\n",
    "- This is used to cache DataFrame results, even when a function returns multiple DataFrames, which can save a huge amount of time in feature engineering\n",
    "- It also supports logging a function's time cost and parameters\n",
    "- It automatically creates a **cache** folder in your working directory, where the cached results are stored\n",
    "\n",
    "\n",
    "You can also get the demo from [here](https://github.com/Flyfoxs/file_cache/blob/master/demo.ipynb); it is easier to understand.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Installation\n",
    "`pip install file_cache`\n",
    "\n",
    "`pip install --user file_cache` (install without root privileges)\n",
    "\n",
    "[pypi Link](https://pypi.org/project/file-cache/)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample case"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-12-27 11:12:36,493 util_log.py[61] DEBUG Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n",
      "2018-12-27 11:12:36,500 util_log.py[41] INFO test_cache_normal begin with(1 paras) :['Felix'], []\n",
      "2018-12-27 11:12:36,703 cache.py[29] DEBUG try to read cache from file:./cache/test_cache_normal=Felix=.h5, (h5, key:['/df_0'])\n",
      "2018-12-27 11:12:36,727 reduce_mem.py[63] DEBUG Mem. usage decreased from    0.00 to    0.00 Mb (72.9% reduction)\n",
      "2018-12-27 11:12:36,730 reduce_mem.py[25] WARNING The return type for fun#test_cache_normal is:<class 'pandas.core.frame.DataFrame'>\n",
      "2018-12-27 11:12:36,733 util_log.py[49] INFO test_cache_normal cost    0.23 sec:(1 paras)(['Felix'], []), return:DataFrame, end \n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1  2  3  4\n",
       "0  0  1  2  3  4\n",
       "1  5  6  7  8  9"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from file_cache.cache import file_cache\n",
    "\n",
    "\n",
    "@file_cache()\n",
    "def test_cache_normal(name):\n",
    "    \"\"\"Return a 2x5 DataFrame holding 0..9 after a 3 second pause.\n",
    "\n",
    "    The sleep stands in for slow work. `name` is not read in the body.\n",
    "    \"\"\"\n",
    "    time.sleep(3)\n",
    "    return pd.DataFrame(data=np.arange(0, 10).reshape(2, 5))\n",
    "\n",
    "\n",
    "normal_df = test_cache_normal('Felix')\n",
    "normal_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Return multiple DataFrames as a tuple\n",
    "Caching also works when the function returns a tuple of DataFrames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-12-27 11:12:36,766 util_log.py[41] INFO test_cache_tuple begin with(1 paras) :['Felix2'], []\n",
      "2018-12-27 11:12:36,783 cache.py[29] DEBUG try to read cache from file:./cache/test_cache_tuple=Felix2=.h5, (h5, key:['/df_0', '/df_1'])\n",
      "2018-12-27 11:12:36,804 reduce_mem.py[63] DEBUG Mem. usage decreased from    0.00 to    0.00 Mb (72.9% reduction)\n",
      "2018-12-27 11:12:36,810 reduce_mem.py[63] DEBUG Mem. usage decreased from    0.00 to    0.00 Mb (72.9% reduction)\n",
      "2018-12-27 11:12:36,814 util_log.py[49] INFO test_cache_tuple cost    0.05 sec:(1 paras)(['Felix2'], []), return:tuple, end \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    0   1   2   3   4\n",
      "0   5   6   7   8   9\n",
      "1  10  11  12  13  14 \n",
      "\n",
      "    0   1   2   3   4\n",
      "0  20  21  22  23  24\n",
      "1  25  26  27  28  29\n"
     ]
    }
   ],
   "source": [
    "from functools import lru_cache\n",
    "import time\n",
    "\n",
    "\n",
    "@lru_cache()\n",
    "@file_cache()\n",
    "def test_cache_tuple(name):\n",
    "    \"\"\"Sleep 3 seconds, then return two 2x5 DataFrames (5..14 and 20..29).\n",
    "\n",
    "    `name` is not read in the body.\n",
    "    \"\"\"\n",
    "    time.sleep(3)\n",
    "    first, second = (\n",
    "        pd.DataFrame(data=np.arange(start, start + 10).reshape(2, 5))\n",
    "        for start in (5, 20)\n",
    "    )\n",
    "    return first, second\n",
    "\n",
    "\n",
    "df0, df1 = test_cache_tuple('Felix2')\n",
    "print(df0, '\\n')\n",
    "print(df1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Skip the cache when the input parameters cannot be cached\n",
    "If an argument is a DataFrame or cannot be hashed, the cache is skipped and the function is run directly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-12-27 11:12:36,839 util_log.py[41] INFO test_cache_ignore begin with(1 paras) :['DataFrame'], []\n",
      "2018-12-27 11:12:36,845 cache.py[113] DEBUG There is DataFrame in the args\n",
      "2018-12-27 11:12:36,850 cache.py[95] WARNING Don not support cache for fn:test_cache_ignore, para:DataFrame, kw:{}\n",
      "2018-12-27 11:12:36,862 reduce_mem.py[63] DEBUG Mem. usage decreased from    0.00 to    0.00 Mb (43.8% reduction)\n",
      "2018-12-27 11:12:36,863 reduce_mem.py[25] WARNING The return type for fun#test_cache_ignore is:<class 'pandas.core.frame.DataFrame'>\n",
      "2018-12-27 11:12:36,867 util_log.py[49] INFO test_cache_ignore cost    0.03 sec:(1 paras)(['DataFrame'], []), return:DataFrame, end \n"
     ]
    }
   ],
   "source": [
    "@file_cache()\n",
    "def test_cache_ignore(name):\n",
    "    \"\"\"Return a fresh 2x5 DataFrame of 5..14; `name` is not read in the body.\"\"\"\n",
    "    return pd.DataFrame(data=np.arange(5, 15).reshape(2, 5))\n",
    "\n",
    "\n",
    "# Pass a DataFrame as the argument to exercise the non-cacheable path.\n",
    "df = pd.DataFrame(data=np.arange(5, 15).reshape(2, 5))\n",
    "ignore = test_cache_ignore(df)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Log the function's run time and parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-12-27 11:12:36,879 util_log.py[41] INFO log_time begin with(1 paras) :['hello'], []\n",
      "2018-12-27 11:12:36,883 util_log.py[49] INFO log_time cost    0.00 sec:(1 paras)(['hello'], []), return:hello msg, end \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello msg\n"
     ]
    }
   ],
   "source": [
    "from file_cache.utils.util_log import timed\n",
    "\n",
    "\n",
    "@timed()\n",
    "def log_time(arg):\n",
    "    \"\"\"Return `arg` formatted into the string '<arg> msg'.\"\"\"\n",
    "    return f'{arg} msg'\n",
    "\n",
    "\n",
    "print(log_time(\"hello\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Series are supported as well as DataFrames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-12-27 11:21:42,935 util_log.py[61] DEBUG Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n",
      "2018-12-27 11:21:42,941 util_log.py[41] INFO get_train_data begin with(0 paras) :[], []\n",
      "2018-12-27 11:21:43,067 cache.py[29] DEBUG Read cache from file:./cache/get_train_data==.h5,key:['/df_0', '/df_1']\n",
      "2018-12-27 11:21:43,098 reduce_mem.py[63] DEBUG Mem. usage decreased from    0.06 to    0.02 Mb (70.0% reduction)\n",
      "2018-12-27 11:21:43,101 util_log.py[49] INFO get_train_data cost    0.16 sec:(0 paras)([], []), return:tuple, end \n",
      "2018-12-27 11:21:43,103 util_log.py[41] INFO get_train_data begin with(0 paras) :[], []\n",
      "2018-12-27 11:21:43,115 cache.py[29] DEBUG Read cache from file:./cache/get_train_data==.h5,key:['/df_0', '/df_1']\n",
      "2018-12-27 11:21:43,142 reduce_mem.py[63] DEBUG Mem. usage decreased from    0.06 to    0.02 Mb (70.0% reduction)\n",
      "2018-12-27 11:21:43,145 util_log.py[49] INFO get_train_data cost    0.04 sec:(0 paras)([], []), return:tuple, end \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>\n",
      "<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>\n"
     ]
    }
   ],
   "source": [
    "from file_cache.cache import file_cache\n",
    "\n",
    "\n",
    "@file_cache()\n",
    "def get_train_data():\n",
    "    \"\"\"Load the Boston housing data from scikit-learn.\n",
    "\n",
    "    Returns a tuple: the feature DataFrame with an added 'target' column,\n",
    "    and that 'target' column as a Series.\n",
    "    \"\"\"\n",
    "    # Imports stay inside the function so the cell does not depend on earlier cells.\n",
    "    # NOTE(review): load_boston was removed in scikit-learn 1.2 - pin an older\n",
    "    # release or switch to another dataset before re-running.\n",
    "    from sklearn import datasets\n",
    "    import pandas as pd\n",
    "\n",
    "    data = datasets.load_boston()\n",
    "    df = pd.DataFrame(data.data, columns=data.feature_names)\n",
    "    df['target'] = data.target\n",
    "    return df, df['target']\n",
    "\n",
    "\n",
    "# Called twice; both results are expected to be a (DataFrame, Series) pair.\n",
    "df, series = get_train_data()\n",
    "print(type(df), type(series))\n",
    "\n",
    "df, series = get_train_data()\n",
    "print(type(df), type(series))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reduce memory usage of pandas objects by 70+%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2018-12-27 11:25:32,797 reduce_mem.py[63] DEBUG Mem. usage decreased from    0.05 to    0.01 Mb (74.9% reduction)\n",
      "2018-12-27 11:25:32,801 reduce_mem.py[25] WARNING The return type for fun#get_train_data is:<class 'pandas.core.frame.DataFrame'>\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original type:\n",
      "CRIM       float64\n",
      "ZN         float64\n",
      "INDUS      float64\n",
      "CHAS       float64\n",
      "NOX        float64\n",
      "RM         float64\n",
      "AGE        float64\n",
      "DIS        float64\n",
      "RAD        float64\n",
      "TAX        float64\n",
      "PTRATIO    float64\n",
      "B          float64\n",
      "LSTAT      float64\n",
      "dtype: object\n",
      "New type:\n",
      "CRIM       float16\n",
      "ZN         float16\n",
      "INDUS      float16\n",
      "CHAS       float16\n",
      "NOX        float16\n",
      "RM         float16\n",
      "AGE        float16\n",
      "DIS        float16\n",
      "RAD        float16\n",
      "TAX        float16\n",
      "PTRATIO    float16\n",
      "B          float16\n",
      "LSTAT      float16\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "from file_cache.utils.reduce_mem import reduce_mem\n",
    "from file_cache.utils.util_log import logger\n",
    "\n",
    "\n",
    "# NOTE: this redefines get_train_data and shadows the definition from the previous code cell.\n",
    "@reduce_mem()\n",
    "def get_train_data():\n",
    "    \"\"\"Load the Boston housing features as a DataFrame and print its dtypes.\n",
    "\n",
    "    Returns the feature DataFrame as built, before any decorator processing.\n",
    "    \"\"\"\n",
    "    # NOTE(review): load_boston was removed in scikit-learn 1.2 - pin an older\n",
    "    # release or switch to another dataset before re-running.\n",
    "    from sklearn import datasets\n",
    "    import pandas as pd\n",
    "\n",
    "    data = datasets.load_boston()\n",
    "    df = pd.DataFrame(data.data, columns=data.feature_names)\n",
    "    print(f'Original type:\\n{df.dtypes}')\n",
    "    return df\n",
    "\n",
    "\n",
    "df = get_train_data()\n",
    "\n",
    "print(f'New type:\\n{df.dtypes}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
